Setup

library(tidyverse)
library(lubridate)
library(leaflet)
incidents <- feather::read_feather("incidents.feather")
drivers <- feather::read_feather("drivers.feather")
non_motorists <- feather::read_feather("non_motorists.feather")
single_color <- "#3182bd"
basic_theme <- theme(
  plot.title = element_text(size = 24, vjust = .5, hjust = .5),
  axis.title = element_text(size = 16, hjust = .5, vjust = .5),
  panel.background = element_rect(fill = "white"),
  legend.position = "none",
  panel.grid.major.y = element_line(color = "grey75", size = .1, linetype = "solid")
)

Incidents

This is a quick overview of the structure of the Incidents table.

skimr::skim_to_wide(incidents) %>% DT::datatable()
## Warning: 'skimr::skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")

There have been 146 fatal crashes out of 58131 recorded incidents since this data set began in 2015.

incidents %>%
  group_by(acrs_report_type) %>%
  count() %>%
  knitr::kable()
acrs_report_type n
Fatal Crash 146
Injury Crash 20787
Property Damage Crash 37198

Location of Incidents

This interactive map shows the locations of the incidents in the data set. The 112 incidents that do not fall within the limits of the county are colored red.

pal <- colorFactor(c("blue", "red"), domain = c(0, 1))
incidents %>%
  leaflet() %>%
  addTiles() %>%
  addCircleMarkers(lng = ~longitude, lat = ~latitude, radius = 3, stroke = TRUE, weight = 2, opacity = 1, color = ~ pal(not_in_county))

Time variables

Crashes peak around 5:00 PM, which lines up with the evening rush hour. It seems that the report times are often rounded to the nearest fifteen or five minute round number.

incidents %>%
  select(incident_hour, incident_minute) %>%
  mutate(incident_time = 60 * incident_hour + incident_minute) %>%
  ggplot() +
  stat_count(aes(x = incident_time), fill = single_color) +
  scale_x_continuous(
    name = "Time",
    breaks = seq.int(from = 0, to = 24 * 60, by = 120),
    labels = c(paste0(seq.int(from = 0, to = 22, by = 2), ":00"), "0:00")
  ) +
  ggtitle("Crashes by Time of Day") +
  basic_theme

Crashes are slightly less frequent on the weekends than on weekdays.

incidents %>%
  select(incident_weekday) %>%
  ggplot() +
  stat_count(aes(x = incident_weekday), fill = single_color) +
  scale_x_continuous(
    name = "Day of Week",
    breaks = 1:7,
    labels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thurday", "Friday", "Saturday")
  ) +
  ggtitle("Crashes by Day of Week") +
  basic_theme

The number of crashes seems to be fairly constant over time.

incidents %>%
  select(incident_year) %>%
  ggplot() +
  stat_count(aes(x = incident_year), fill = single_color) +
  ggtitle("Crashes by Year") +
  basic_theme

This table contains the number of crashes recorded on each date. 2019 is excluded because the year hasn’t concluded and the counts for dates that already have happened would be inflated. The dates with the most crashes tend to come from the last four months of the year, but holidays including Christmas and New Years are among the days with the fewest crashes.

incidents %>%
  filter(incident_year < 2019) %>%
  select(incident_month, incident_day) %>%
  mutate(incident_date = paste0(incident_month, "/", incident_day)) %>%
  count(incident_date) %>%
  arrange(desc(n)) %>%
  rename(Date = incident_date, `Number of Crashes` = n) %>%
  DT::datatable()

Weather

Most crashes come under clear weather conditions.

incidents %>%
  group_by(weather) %>%
  count() %>%
  arrange(desc(n)) %>%
  knitr::kable()
weather n
CLEAR 38224
RAINING 7127
CLOUDY 6103
N/A 4894
SNOW 676
UNKNOWN 361
FOGGY 233
WINTRY MIX 171
OTHER 140
SLEET 77
SEVERE WINDS 64
BLOWING SNOW 54
BLOWING SAND SOIL DIRT 7

Substance Abuse

Crashes involving alcohol and illegal drugs constitute a small minority of the total incidents.

incidents %>%
  mutate(alcohol = driver_substance_abuse_alcohol_present | driver_substance_abuse_alcohol_contributed) %>%
  count(alcohol) %>%
  knitr::kable()
alcohol n
FALSE 54857
TRUE 3274
incidents %>%
  count(driver_substance_abuse_alcohol_contributed) %>%
  knitr::kable()
driver_substance_abuse_alcohol_contributed n
0 57250
1 881
incidents %>%
  count(driver_substance_abuse_alcohol_present) %>%
  knitr::kable()
driver_substance_abuse_alcohol_present n
0 55735
1 2396
incidents %>%
  mutate(illegal_drug = driver_substance_abuse_illegal_drug_present | driver_substance_abuse_illegal_drug_contributed) %>%
  count(illegal_drug) %>%
  knitr::kable()
illegal_drug n
FALSE 57909
TRUE 222
incidents %>%
  count(driver_substance_abuse_illegal_drug_contributed) %>%
  knitr::kable()
driver_substance_abuse_illegal_drug_contributed n
0 58084
1 47
incidents %>%
  count(driver_substance_abuse_illegal_drug_present) %>%
  knitr::kable()
driver_substance_abuse_illegal_drug_present n
0 57956
1 175

Alcohol and Time

Crashes involving alcohol do not follow the general time of day trends of crashes as a whole. The late night and early morning hours see more crashes than the daylight hours.

incidents %>%
  filter(driver_substance_abuse_alcohol_present | driver_substance_abuse_alcohol_contributed) %>%
  select(incident_hour, incident_minute) %>%
  mutate(incident_time = 60 * incident_hour + incident_minute) %>%
  ggplot() +
  stat_count(aes(x = incident_time), fill = single_color) +
  scale_x_continuous(
    name = "Time",
    breaks = seq.int(from = 0, to = 24 * 60, by = 120),
    labels = paste0(seq.int(from = 0, to = 24, by = 2), ":00")
  ) +
  ggtitle("Crashes Involving Alcohol by Time of Day") +
  basic_theme

Weekends see more alchol related crashes than weekdays, the opposite of the trend for all crashes.

incidents %>%
  filter(driver_substance_abuse_alcohol_present | driver_substance_abuse_alcohol_contributed) %>%
  select(incident_weekday) %>%
  ggplot() +
  stat_count(aes(x = incident_weekday), fill = single_color) +
  scale_x_continuous(
    name = "Day of Week",
    breaks = 1:7,
    labels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thurday", "Friday", "Saturday")
  ) +
  ggtitle("Crashes Involving Alcohol by Day of Week") +
  basic_theme

Drivers

Overview of the drivers table structure.

skimr::skim_to_wide(drivers) %>% DT::datatable()
## Warning: 'skimr::skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")

Driver Injuries

81.33% of drivers did not sustain any reported injuries. 0.91% of Drivers suffered serious or fatal injuries.

ordered_levels <- drivers %>%
  count(injury_severity) %>%
  arrange(desc(n)) %>%
  pull(injury_severity)
drivers %>%
  mutate(`Injury Severity` = factor(injury_severity, ordered_levels)) %>%
  ggplot() +
  stat_count(aes(x = `Injury Severity`), fill = single_color) +
  ggtitle("Severity of Driver Injuries") +
  basic_theme

drivers %>%
  count(injury_severity) %>%
  arrange(n) %>%
  knitr::kable()
injury_severity n
FATAL INJURY 71
SUSPECTED SERIOUS INJURY 876
SUSPECTED MINOR INJURY 7763
POSSIBLE INJURY 10647
NO APPARENT INJURY 84334

Vehichle Damage

Most reported accidents involved at least some vehicle damage. Just 3.73% of drivers reported no vehicle damage.

ordered_levels <- drivers %>%
  count(vehicle_damage_extent) %>%
  arrange(desc(n)) %>%
  pull(vehicle_damage_extent)
drivers %>%
  mutate(`Vehicle Damage` = factor(vehicle_damage_extent, ordered_levels)) %>%
  ggplot() +
  stat_count(aes(x = `Vehicle Damage`), fill = single_color) +
  ggtitle("Severity of Vehicle Damage") +
  basic_theme

drivers %>%
  count(vehicle_damage_extent) %>%
  arrange(desc(n)) %>%
  knitr::kable()
vehicle_damage_extent n
DISABLING 36731
FUNCTIONAL 28010
SUPERFICIAL 27217
DESTROYED 4081
NO DAMAGE 3874
UNKNOWN 3531
N/A 189
OTHER 58

Speed Limits

drivers %>%
  ggplot() +
  stat_count(aes(x = speed_limit, fill = ), fill = single_color) +
  ggtitle("Speed Limits") +
  basic_theme

ordered_injury <- drivers %>%
  count(injury_severity) %>%
  arrange(desc(n)) %>%
  pull(injury_severity)
drivers %>%
  mutate(
    `Speed Limit` = factor(c("LOW", "MEDIUM", "HIGH")[1 + findInterval(speed_limit, c(31, 46))], levels = c("LOW", "MEDIUM", "HIGH")),
    `Injury Severity` = factor(injury_severity, ordered_injury)
  ) %>%
  ggplot() +
  stat_count(aes(x = `Injury Severity`, fill = `Speed Limit`)) +
  basic_theme +
  theme(legend.position = c(.9, .9)) +
  ggtitle("Injuries and Speed Limits")

ordered_injury <- drivers %>%
  count(injury_severity) %>%
  arrange(desc(n)) %>%
  pull(injury_severity)
drivers %>%
  mutate(
    speed_limit = factor(c("LOW", "MEDIUM", "HIGH")[1 + findInterval(speed_limit, c(31, 46))], levels = c("LOW", "MEDIUM", "HIGH")),
    injury_severity = factor(injury_severity, ordered_injury)
  ) %>%
  group_by(speed_limit, injury_severity) %>%
  tally() %>%
  rename(`Speed Limit` = speed_limit, `Injury Severity` = injury_severity) %>%
  ggplot() +
  geom_bar(aes(x = `Injury Severity`, weight = n, fill = `Speed Limit`), position = "fill") +
  basic_theme +
  theme(legend.position = "top") +
  ggtitle("Injuries and Speed Limits") +
  ylab("Proportion")

Types of Vehicles

ordered_levels <- drivers %>%
  count(vehicle_body_type) %>%
  arrange(desc(n)) %>%
  pull(vehicle_body_type)
drivers %>%
  mutate(`Body Type` = factor(as.character(fct_collapse(vehicle_body_type, OTHER = ordered_levels[-(1:5)])), c(ordered_levels[1:5], "OTHER"))) %>%
  ggplot() +
  stat_count(aes(x = `Body Type`), fill = single_color) +
  ggtitle("Vehicle Body Types") +
  basic_theme

Non-Motorists

Overview of Non-Motorists table structure

skimr::skim_to_wide(non_motorists) %>% DT::datatable()
## Warning: 'skimr::skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")

Types of Non Motorists

Pedestrians are the most common type of non motorist, followed by bicyclists.

ordered_levels <- non_motorists %>%
  count(pedestrian_type) %>%
  arrange(desc(n)) %>%
  pull(pedestrian_type)
non_motorists %>%
  mutate(pedestrian_type = factor(pedestrian_type, ordered_levels)) %>%
  ggplot() +
  stat_count(aes(x = pedestrian_type), fill = single_color) +
  basic_theme +
  ggtitle("Types of Non-Motorists")

Locations of Incidents Involving Non-Motorists

pal <- colorFactor(c("blue", "red"), domain = c(0, 1))
non_motorists %>%
  leaflet() %>%
  addTiles() %>%
  addCircleMarkers(lng = ~longitude, lat = ~latitude, radius = 3, stroke = TRUE, weight = 2, opacity = 1, color = ~ pal(not_in_county))

Injuries

8.67% of non-motorists did not sustain any reported injuries. 13.12% of non-motorists suffered serious or fatal injuries.

ordered_levels <- non_motorists %>%
  count(injury_severity) %>%
  arrange(desc(n)) %>%
  pull(injury_severity)
non_motorists %>%
  mutate(`Injury Severity` = factor(injury_severity, ordered_levels)) %>%
  ggplot() +
  stat_count(aes(x = `Injury Severity`), fill = single_color) +
  ggtitle("Severity of Non Motorist Injuries") +
  basic_theme

non_motorists %>%
  count(injury_severity) %>%
  arrange(n) %>%
  knitr::kable()
injury_severity n
FATAL INJURY 64
NO APPARENT INJURY 285
SUSPECTED SERIOUS INJURY 367
POSSIBLE INJURY 1082
SUSPECTED MINOR INJURY 1486

Bicycle Helmet Usage

ordered_levels <- non_motorists %>%
  filter(pedestrian_type == "BICYCLIST") %>%
  count(safety_equipment) %>%
  arrange(desc(n)) %>%
  pull(safety_equipment)
non_motorists %>%
  filter(pedestrian_type == "BICYCLIST") %>%
  mutate(`Safety Equipment` = factor(safety_equipment, ordered_levels)) %>%
  ggplot() +
  stat_count(aes(x = `Safety Equipment`), fill = single_color) +
  basic_theme +
  ggtitle("What Kinds of Safety Equiptment Do Bicyclists Use?")

The more severe injuries tend to have lower rates of helmet usage than minor injuries.

ordered_safety <- non_motorists %>%
  filter(pedestrian_type == "BICYCLIST") %>%
  count(safety_equipment) %>%
  arrange(desc(n)) %>%
  pull(safety_equipment)
ordered_injury <- non_motorists %>%
  filter(pedestrian_type == "BICYCLIST") %>%
  count(injury_severity) %>%
  arrange(desc(n)) %>%
  pull(injury_severity)
non_motorists %>%
  filter(pedestrian_type == "BICYCLIST") %>%
  mutate(
    `Safety Equipment` = fct_collapse(safety_equipment, OTHER = ordered_safety[-(1:2)]),
    `Injury Severity` = factor(injury_severity, ordered_injury)
  ) %>%
  ggplot() +
  stat_count(aes(x = `Injury Severity`, fill = `Safety Equipment`)) +
  basic_theme +
  theme(legend.position = c(.9, .9)) +
  ggtitle("Injuries and Helmets")

Consistency Check

fatal_drivers <- drivers %>%
  filter(injury_severity_fatal_injury == 1) %>%
  pull(report_number)
fatal_nonm <- non_motorists %>%
  filter(injury_severity_fatal_injury == 1) %>%
  pull(report_number)
fatal_acrs <- incidents %>%
  filter(acrs_report_type_fatal_crash == 1) %>%
  pull(report_number)
recorded_fatal <- c(fatal_drivers, fatal_nonm)
# vector of fatal reports with no recorded fatality
probs <- fatal_acrs %>% .[!. %in% recorded_fatal]
incidents %>%
  filter(report_number %in% probs) %>%
  DT::datatable(options = list(pageLength = 5))
drivers %>%
  filter(report_number %in% probs) %>%
  DT::datatable(options = list(pageLength = 5))
non_motorists %>%
  filter(report_number %in% probs) %>%
  DT::datatable(options = list(pageLength = 5))
drivers %>%
  group_by(report_number) %>%
  tally() %>%
  arrange(desc(n))
## # A tibble: 57,742 x 2
##    report_number     n
##    <chr>         <int>
##  1 MCP12130045       9
##  2 MCP2667000H       8
##  3 MCP1227000M       7
##  4 MCP15800085       7
##  5 MCP23580027       7
##  6 MCP2513001C       7
##  7 MCP2617006L       7
##  8 MCP2871006C       7
##  9 MCP3140000Y       7
## 10 MCP9130001S       7
## # … with 57,732 more rows
drivers %>%
  count(report_number) %>%
  count(n) %>%
  rename(`Vehicles Involved` = n) %>%
  knitr::kable()
Vehicles Involved nn
1 17909
2 34766
3 4207
4 713
5 119
6 17
7 9
8 1
9 1
non_motorists %>%
  group_by(report_number) %>%
  tally() %>%
  arrange(desc(n))
## # A tibble: 3,122 x 2
##    report_number     n
##    <chr>         <int>
##  1 MCP229800LT       4
##  2 MCP2546002K       4
##  3 DD5603004V        3
##  4 DM8445001H        3
##  5 DM8457000R        3
##  6 DM8463000L        3
##  7 EJ7809000W        3
##  8 EJ7833003W        3
##  9 MCP1438002S       3
## 10 MCP20080044       3
## # … with 3,112 more rows
non_motorists %>%
  count(report_number) %>%
  count(n) %>%
  rename(`non_motorists Involved` = n) %>%
  knitr::kable()
non_motorists Involved nn
1 2985
2 114
3 21
4 2
fatal_rate <- drivers %>%
  select(report_number, speed_limit) %>%
  group_by(report_number) %>%
  summarise(speed_limit = max(speed_limit)) %>%
  full_join(incidents) %>%
  group_by(speed_limit) %>%
  summarise(fatal_prop = mean(acrs_report_type_fatal_crash), count = n(), fatal_crashes = sum(acrs_report_type_fatal_crash))
## Joining, by = "report_number"
fatal_rate %>%
  filter(!is.na(speed_limit)) %>%
  ggplot() + geom_line(aes(x = speed_limit, y = fatal_prop))

temp <- fatal_rate %>% filter(speed_limit > 25)
prop.test(temp$fatal_crashes, temp$count)
## Warning in prop.test(temp$fatal_crashes, temp$count): Chi-squared approximation
## may be incorrect
## 
##  9-sample test for equality of proportions without continuity
##  correction
## 
## data:  temp$fatal_crashes out of temp$count
## X-squared = 53.003, df = 8, p-value = 1.077e-08
## alternative hypothesis: two.sided
## sample estimates:
##      prop 1      prop 2      prop 3      prop 4      prop 5      prop 6 
## 0.002949474 0.001816836 0.004212744 0.003819527 0.011564212 0.000000000 
##      prop 7      prop 8      prop 9 
## 0.000000000 0.000000000 0.000000000
temp <- fatal_rate %>%
  filter(speed_limit > 25) %>%
  filter(speed_limit != 50)
prop.test(temp$fatal_crashes, temp$count)
## Warning in prop.test(temp$fatal_crashes, temp$count): Chi-squared approximation
## may be incorrect
## 
##  8-sample test for equality of proportions without continuity
##  correction
## 
## data:  temp$fatal_crashes out of temp$count
## X-squared = 17.974, df = 7, p-value = 0.01209
## alternative hypothesis: two.sided
## sample estimates:
##      prop 1      prop 2      prop 3      prop 4      prop 5      prop 6 
## 0.002949474 0.001816836 0.004212744 0.003819527 0.000000000 0.000000000 
##      prop 7      prop 8 
## 0.000000000 0.000000000